{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "369c3444",
   "metadata": {},
   "source": [
    "# Load JSON and Metadata filter JSON field or array"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "f6ffd11a",
   "metadata": {},
   "source": [
    "In this notebook, we are going to use Kaggle IMDB data, available as either raw JSON or CSV. We'll load it into Milvus vector database, then search with a metadata filter.\n",
    "\n",
    "Let's get started!"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "b2509fe9",
   "metadata": {},
   "outputs": [],
   "source": [
    "# For colab install these libraries in this order:\n",
    "# !pip install numpy pandas torch pymilvus langchain transformers sentence-transformers \n",
    "# !pip install python-dotenv"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "d7570b2e",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Import common libraries.\n",
    "import sys, os, time, pprint, json\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "\n",
    "# Import custom functions for splitting and search.\n",
    "sys.path.append(\"..\")  # Adds higher directory to python modules path.\n",
    "import milvus_utilities as _utils\n",
    "\n",
    "# Decide if you want to read JSON or CSV.\n",
    "DATA_TYPE = \"JSON\" # \"CSV\" or \"JSON\""
   ]
  },
  {
   "cell_type": "markdown",
   "id": "175c8133",
   "metadata": {},
   "source": [
    "## Read JSON data into a pandas dataframe\n",
    "\n",
    "The JSON data comes from https://www.kaggle.com/datasets/nelepie/imdb-genre-classification"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "36ee071f",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "df shape: (7, 10357)\n",
      "data is valid JSON.\n",
      "df shape: (100, 8)\n",
      "movie_index    object\n",
      "title          object\n",
      "description    object\n",
      "poster_url     object\n",
      "labels         object\n",
      "Genres         object\n",
      "film_year       int64\n",
      "text           object\n",
      "dtype: object\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>movie_index</th>\n",
       "      <th>title</th>\n",
       "      <th>description</th>\n",
       "      <th>poster_url</th>\n",
       "      <th>labels</th>\n",
       "      <th>Genres</th>\n",
       "      <th>film_year</th>\n",
       "      <th>text</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>tt6443346</td>\n",
       "      <td>Black Adam</td>\n",
       "      <td>Nearly 5,000 years after he was bestowed with ...</td>\n",
       "      <td>https://m.media-amazon.com/images/M/MV5BYzZkOG...</td>\n",
       "      <td>SuperHero</td>\n",
       "      <td>[Action, Adventure, Fantasy]</td>\n",
       "      <td>2022</td>\n",
       "      <td>Black Adam Nearly 5,000 years after he was bes...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>tt10954600</td>\n",
       "      <td>Ant-Man and the Wasp: Quantumania</td>\n",
       "      <td>Scott Lang and Hope Van Dyne, along with Hank ...</td>\n",
       "      <td>https://m.media-amazon.com/images/M/MV5BNDgyNG...</td>\n",
       "      <td>SuperHero</td>\n",
       "      <td>[Action, Adventure, Comedy]</td>\n",
       "      <td>2023</td>\n",
       "      <td>Ant-Man and the Wasp: Quantumania Scott Lang a...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "  movie_index                              title  \\\n",
       "0   tt6443346                         Black Adam   \n",
       "1  tt10954600  Ant-Man and the Wasp: Quantumania   \n",
       "\n",
       "                                         description  \\\n",
       "0  Nearly 5,000 years after he was bestowed with ...   \n",
       "1  Scott Lang and Hope Van Dyne, along with Hank ...   \n",
       "\n",
       "                                          poster_url     labels  \\\n",
       "0  https://m.media-amazon.com/images/M/MV5BYzZkOG...  SuperHero   \n",
       "1  https://m.media-amazon.com/images/M/MV5BNDgyNG...  SuperHero   \n",
       "\n",
       "                         Genres  film_year  \\\n",
       "0  [Action, Adventure, Fantasy]       2022   \n",
       "1   [Action, Adventure, Comedy]       2023   \n",
       "\n",
       "                                                text  \n",
       "0  Black Adam Nearly 5,000 years after he was bes...  \n",
       "1  Ant-Man and the Wasp: Quantumania Scott Lang a...  "
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Example text length: 240\n",
      "Example text: Black Adam Nearly 5,000 years after he was bestowed with the almighty powers of the Egyptian gods - and imprisoned just as quickly - Black Adam is freed from his earthly tomb, ready to unleash his unique form of justice on the modern world.\n"
     ]
    }
   ],
   "source": [
    "if DATA_TYPE == \"JSON\":\n",
    "\n",
    "    # Read some JSON data.\n",
    "    df = pd.read_json('data/parsed_data.json')\n",
    "    print(f\"df shape: {df.shape}\")\n",
    "    df = df.T\n",
    "\n",
    "    # Save top 100 rows as tiny json file\n",
    "    temp = df.head(100).copy()\n",
    "    temp.to_json('data/tiny_parsed_data.json')\n",
    "    df = pd.read_json('data/tiny_parsed_data.json')\n",
    "\n",
    "    # Reset index and call it movie_index.\n",
    "    df = df.reset_index().rename(columns={'index': 'movie_index'})\n",
    "\n",
    "    # Drop release date, it has too many nulls.\n",
    "    df.drop(columns=['releaseDate'], inplace=True)\n",
    "\n",
    "    # Reverse column names label, genres.\n",
    "    df = df.rename(columns={'labels': 'Genres'})\n",
    "    df = df.rename(columns={'genre': 'labels'})\n",
    "\n",
    "    # Convert year to a number.\n",
    "    df['film_year'] = df.film_year.astype(int)\n",
    "\n",
    "    # Concatenate Title and Description into 'text' column.\n",
    "    df['text'] = df['title'] + ' ' + df['description']\n",
    "\n",
    "    # Verify data is valid json.\n",
    "    try:\n",
    "        # Convert temp to a JSON string and try to parse it\n",
    "        json.loads(df.to_json())\n",
    "        print(\"data is valid JSON.\")\n",
    "    except json.JSONDecodeError:\n",
    "        print(\"data is not valid JSON.\")\n",
    "\n",
    "    print(f\"df shape: {df.shape}\")\n",
    "    print(df.dtypes)\n",
    "    display(df.head(2))\n",
    "\n",
    "    # Inspect text.\n",
    "    print(f\"Example text length: {len(df.text[0])}\")\n",
    "    print(f\"Example text: {df.text[0]}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "2ab684ec",
   "metadata": {},
   "source": [
    "## Read CSV data into a pandas dataframe\n",
    "\n",
    "The data used in this notebook is [Kaggle 48K movies](https://www.kaggle.com/datasets/yashgupta24/48000-movies-dataset) which contains a lot of metadata in addition to the raw review text.\n",
    "\n",
    "Usually there is a data cleaning step.  Such as replace empty strings with \"\" or unusual and empty fields with median values.  Below, I'll just drop rows with null values."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "ecab0d57",
   "metadata": {},
   "outputs": [],
   "source": [
    "if DATA_TYPE == \"CSV\":\n",
    "\n",
    "    # Read CSV data.\n",
    "    df = pd.read_csv(\"data/kaggle_imdb_small.csv\")\n",
    "\n",
    "    # Concatenate 'Name', 'Keywords', and 'Description' into 'text' column\n",
    "    df['text'] = df['Name'] + ' ' + df['Description'] + ' ' + df['ReviewBody']\n",
    "\n",
    "    # Convert genres from string with commas in it to list of strings.\n",
    "    df['Genres'] = df['Genres'].str.split(',')\n",
    "\n",
    "    # Convert actors from string with commas in it to list of strings.\n",
    "    df['Actors'] = df['Actors'].str.split(',')\n",
    "\n",
    "    # Convert keywords from string with commas in it to list of strings.\n",
    "    df['Keywords'] = df['Keywords'].str.split(',')\n",
    "\n",
    "    # Extract out just the year from the date.\n",
    "    df['DatePublished'] = df.DatePublished.apply(lambda x: x.split('-')[0])\n",
    "    df['DatePublished'] = df.DatePublished.astype(int)\n",
    "\n",
    "    # Drop extra rating columns.\n",
    "    df.drop(columns=['RatingCount', 'BestRating', 'WorstRating'], inplace=True)\n",
    "\n",
    "    # Inspect text.\n",
    "    print(f\"Example text length: {len(df.text[0])}\")\n",
    "    pprint.pprint(f\"Example text: {df.text[0]}\")\n",
    "\n",
    "    print(df.dtypes)\n",
    "    display(df.head(2))"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "fb844837",
   "metadata": {},
   "source": [
    "## Start up a Zilliz free tier cluster.\n",
    "\n",
    "Code in this notebook uses fully-managed Milvus on [Ziliz Cloud free trial](https://cloud.zilliz.com/login).  \n",
    "  1. Choose the default \"Starter\" option when you provision > Create collection > Give it a name > Create cluster and collection.  \n",
    "  2. On the Cluster main page, copy your `API Key` and store it locally in a .env variable.  See note below how to do that.\n",
    "  3. Also on the Cluster main page, copy the `Public Endpoint URI`.\n",
    "\n",
    "💡 Note: To keep your tokens private, best practice is to use an **env variable**.  See [how to save api key in env variable](https://help.openai.com/en/articles/5112595-best-practices-for-api-key-safety). <br>\n",
    "\n",
    "👉🏼 In Jupyter, you need a .env file (in same dir as notebooks) containing lines like this:\n",
    "- ZILLIZ_API_KEY=f370c...\n",
    "- OPENAI_API_KEY=sk-H...\n",
    "- VARIABLE_NAME=value..."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "0806d2db",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Type of server: Zilliz Cloud Vector Database(Compatible with Milvus 2.3)\n"
     ]
    }
   ],
   "source": [
    "# STEP 1. CONNECT TO ZILLIZ CLOUD\n",
    "\n",
    "# !pip install pymilvus #python sdk for milvus\n",
    "from pymilvus import connections, utility\n",
    "from dotenv import load_dotenv\n",
    "load_dotenv()\n",
    "TOKEN = os.getenv(\"ZILLIZ_API_KEY\")\n",
    "\n",
    "# Connect to Zilliz cloud using endpoint URI and API key TOKEN.\n",
    "# TODO change this.\n",
    "CLUSTER_ENDPOINT=\"https://in03-xxxx.api.gcp-us-west1.zillizcloud.com:443\"\n",
    "CLUSTER_ENDPOINT=\"https://in03-48a5b11fae525c9.api.gcp-us-west1.zillizcloud.com:443\"\n",
    "connections.connect(\n",
    "  alias='default',\n",
    "  #  Public endpoint obtained from Zilliz Cloud\n",
    "  uri=CLUSTER_ENDPOINT,\n",
    "  # API key or a colon-separated cluster username and password\n",
    "  token=TOKEN,\n",
    ")\n",
    "\n",
    "# Check if the server is ready and get collection name.\n",
    "print(f\"Type of server: {utility.get_server_version()}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "b01d6622",
   "metadata": {},
   "source": [
    "## Load the Embedding Model checkpoint and use it to create vector embeddings\n",
    "**Embedding model:**  We will use the open-source [sentence transformers](https://www.sbert.net/docs/pretrained_models.html) available on HuggingFace to encode the documentation text.  We will download the model from HuggingFace and run it locally. \n",
    "\n",
    "💡Tip:  A good way to choose a sentence transformer model is to check the [MTEB Leaderboard](https://huggingface.co/spaces/mteb/leaderboard).  Sort descending by column \"Retrieval Average\" and choose the best-performing small model.\n",
    "\n",
    "Two model parameters of note below:\n",
    "1. EMBEDDING_DIM refers to the dimensionality or length of the embedding vector. In this case, the embeddings generated for EACH token in the input text will have the SAME length = 1024. This size of embedding is often associated with BERT-based models, where the embeddings are used for downstream tasks such as classification, question answering, or text generation. <br><br>\n",
    "2. MAX_SEQ_LENGTH is the maximum Context Length the encoder model can handle for input sequences. In this case, if sequences longer than 512 tokens are given to the model, everything longer will be (silently!) chopped off.  This is the reason why a chunking strategy is needed to segment input texts into chunks with lengths that will fit in the model's input."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "dd2be7fd",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "device: cpu\n",
      "<class 'sentence_transformers.SentenceTransformer.SentenceTransformer'>\n",
      "SentenceTransformer(\n",
      "  (0): Transformer({'max_seq_length': 512, 'do_lower_case': False}) with Transformer model: BertModel \n",
      "  (1): Pooling({'word_embedding_dimension': 1024, 'pooling_mode_cls_token': True, 'pooling_mode_mean_tokens': False, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})\n",
      ")\n",
      "model_name: WhereIsAI/UAE-Large-V1\n",
      "EMBEDDING_DIM: 1024\n",
      "MAX_SEQ_LENGTH: 512\n"
     ]
    }
   ],
   "source": [
    "# STEP 2. DOWNLOAD AN OPEN SOURCE EMBEDDING MODEL.\n",
    "\n",
    "# Import torch.\n",
    "import torch\n",
    "from sentence_transformers import SentenceTransformer\n",
    "\n",
    "# Initialize torch settings\n",
    "torch.backends.cudnn.deterministic = True\n",
    "DEVICE = torch.device('cuda:3' if torch.cuda.is_available() else 'cpu')\n",
    "print(f\"device: {DEVICE}\")\n",
    "\n",
    "# Load the model from huggingface model hub.\n",
    "model_name = \"WhereIsAI/UAE-Large-V1\"\n",
    "encoder = SentenceTransformer(model_name, device=DEVICE)\n",
    "print(type(encoder))\n",
    "print(encoder)\n",
    "\n",
    "# Get the model parameters and save for later.\n",
    "EMBEDDING_DIM = encoder.get_sentence_embedding_dimension()\n",
    "MAX_SEQ_LENGTH_IN_TOKENS = encoder.get_max_seq_length() \n",
    "# # Assume tokens are 3 characters long.\n",
    "# MAX_SEQ_LENGTH = MAX_SEQ_LENGTH_IN_TOKENS * 3\n",
    "# HF_EOS_TOKEN_LENGTH = 1 * 3\n",
    "# Test with 512 sequence length.\n",
    "MAX_SEQ_LENGTH = MAX_SEQ_LENGTH_IN_TOKENS\n",
    "HF_EOS_TOKEN_LENGTH = 1\n",
    "\n",
    "# Inspect model parameters.\n",
    "print(f\"model_name: {model_name}\")\n",
    "print(f\"EMBEDDING_DIM: {EMBEDDING_DIM}\")\n",
    "print(f\"MAX_SEQ_LENGTH: {MAX_SEQ_LENGTH}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Create a Milvus collection\n",
    "\n",
    "You can think of a collection in Milvus like a \"table\" in SQL databases.  The **collection** will contain the \n",
    "- **Schema** (or [no-schema Milvus client](https://milvus.io/docs/using_milvusclient.md)).  \n",
    "💡 You'll need the vector `EMBEDDING_DIM` parameter from your embedding model.\n",
    "Typical values are:\n",
    "   - 1024 for sbert embedding models\n",
    "   - 1536 for ada-002 OpenAI embedding models\n",
    "- **Vector index** for efficient vector search\n",
    "- **Vector distance metric** for measuring nearest neighbor vectors\n",
    "- **Consistency level**\n",
    "In Milvus, transactional consistency is possible; however, according to the [CAP theorem](https://en.wikipedia.org/wiki/CAP_theorem), some latency must be sacrificed. 💡 Searching movie reviews is not mission-critical, so [`eventually`](https://milvus.io/docs/consistency.md) consistent is fine here.\n",
    "\n",
    "## Add a Vector Index\n",
    "\n",
    "The vector index determines the vector **search algorithm** used to find the closest vectors in your data to the query a user submits.  \n",
    "\n",
    "Most vector indexes use different sets of parameters depending on whether the database is:\n",
    "- **inserting vectors** (creation mode) - vs - \n",
    "- **searching vectors** (search mode) \n",
    "\n",
    "Scroll down the [docs page](https://milvus.io/docs/index.md) to see a table listing different vector indexes available on Milvus.  For example:\n",
    "- FLAT - deterministic exhaustive search\n",
    "- IVF_FLAT or IVF_SQ8 - Hash index (stochastic approximate search)\n",
    "- HNSW - Graph index (stochastic approximate search)\n",
    "- AUTOINDEX - Automatically determined based on OSS vs [Zilliz cloud](https://docs.zilliz.com/docs/autoindex-explained), type of GPU, size of data.\n",
    "\n",
    "Besides a search algorithm, we also need to specify a **distance metric**, that is, a definition of what is considered \"close\" in vector space.  In the cell below, the [`HNSW`](https://github.com/nmslib/hnswlib/blob/master/ALGO_PARAMS.md) search index is chosen.  Its possible distance metrics are one of:\n",
    "- L2 - L2-norm\n",
    "- IP - Dot-product\n",
    "- COSINE - Angular distance\n",
    "\n",
    "💡 Most use cases work better with normalized embeddings, in which case L2 is useless (every vector has length=1) and IP and COSINE are the same.  Only choose L2 if you plan to keep your embeddings unnormalized."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Successfully created collection: `imdb_metadata`\n",
      "{'aliases': [],\n",
      " 'auto_id': True,\n",
      " 'collection_id': 448076879578126663,\n",
      " 'collection_name': 'imdb_metadata',\n",
      " 'consistency_level': 3,\n",
      " 'description': '',\n",
      " 'enable_dynamic_field': True,\n",
      " 'fields': [{'auto_id': True,\n",
      "             'description': '',\n",
      "             'field_id': 100,\n",
      "             'is_primary': True,\n",
      "             'name': 'id',\n",
      "             'params': {},\n",
      "             'type': <DataType.INT64: 5>},\n",
      "            {'description': '',\n",
      "             'field_id': 101,\n",
      "             'name': 'vector',\n",
      "             'params': {'dim': 1024},\n",
      "             'type': <DataType.FLOAT_VECTOR: 101>}],\n",
      " 'num_partitions': 1,\n",
      " 'num_shards': 1,\n",
      " 'properties': {}}\n"
     ]
    }
   ],
   "source": [
    "# STEP 3. CREATE A NO-SCHEMA MILVUS COLLECTION AND DEFINE THE DATABASE INDEX.\n",
    "\n",
    "from pymilvus import MilvusClient\n",
    "\n",
    "# Set the Milvus collection name.\n",
    "COLLECTION_NAME = \"imdb_metadata\"\n",
    "\n",
    "# Add custom HNSW search index to the collection.\n",
    "# M = max number graph connections per layer. Large M = denser graph.\n",
    "# Choice of M: 4~64, larger M for larger data and larger embedding lengths.\n",
    "M = 16\n",
    "# efConstruction = num_candidate_nearest_neighbors per layer. \n",
    "# Use Rule of thumb: int. 8~512, efConstruction = M * 2.\n",
    "efConstruction = M * 2\n",
    "# Create the search index for local Milvus server.\n",
    "INDEX_PARAMS = dict({\n",
    "    'M': M,               \n",
    "    \"efConstruction\": efConstruction })\n",
    "index_params = {\n",
    "    \"index_type\": \"HNSW\", \n",
    "    \"metric_type\": \"COSINE\", \n",
    "    \"params\": INDEX_PARAMS\n",
    "    }\n",
    "\n",
    "# Use no-schema Milvus client uses flexible json key:value format.\n",
    "# https://milvus.io/docs/using_milvusclient.md\n",
    "mc = MilvusClient(\n",
    "    uri=CLUSTER_ENDPOINT,\n",
    "    # API key or a colon-separated cluster username and password\n",
    "    token=TOKEN)\n",
    "\n",
    "# Check if collection already exists, if so drop it.\n",
    "has = utility.has_collection(COLLECTION_NAME)\n",
    "if has:\n",
    "    drop_result = utility.drop_collection(COLLECTION_NAME)\n",
    "    print(f\"Successfully dropped collection: `{COLLECTION_NAME}`\")\n",
    "\n",
    "# Create the collection.\n",
    "mc.create_collection(COLLECTION_NAME, \n",
    "                     EMBEDDING_DIM,\n",
    "                     consistency_level=\"Eventually\", \n",
    "                     auto_id=True,  \n",
    "                     overwrite=True,\n",
    "                     # skip setting params below, if using AUTOINDEX\n",
    "                     params=index_params\n",
    "                    )\n",
    "\n",
    "print(f\"Successfully created collection: `{COLLECTION_NAME}`\")\n",
    "pprint.pprint(mc.describe_collection(COLLECTION_NAME))"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "c60423a5",
   "metadata": {},
   "source": [
    "## Chunking\n",
    "\n",
    "Before embedding, it is necessary to decide your chunk strategy, chunk size, and chunk overlap.  In this demo, I will use:\n",
    "- **Strategy** = Keep movie reveiws as single chunks unless they are too long.\n",
    "- **Chunk size** = Use the embedding model's parameter `MAX_SEQ_LENGTH`\n",
    "- **Overlap** = Rule-of-thumb 10-15%\n",
    "- **Function** = Langchain's convenient `RecursiveCharacterTextSplitter` to split up long reviews recursively."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "006145b9",
   "metadata": {},
   "outputs": [],
   "source": [
    "# STEP 4. PREPARE DATA: CHUNK AND EMBED\n",
    "from langchain.text_splitter import RecursiveCharacterTextSplitter\n",
    "\n",
    "def recursive_splitter_wrapper(text, chunk_size):\n",
    "\n",
    "    # Default chunk overlap is 10% chunk_size.\n",
    "    chunk_overlap = np.round(chunk_size * 0.10, 0)\n",
    "\n",
    "    # Use langchain's convenient recursive chunking method.\n",
    "    text_splitter = RecursiveCharacterTextSplitter(\n",
    "        chunk_size=chunk_size,\n",
    "        chunk_overlap=chunk_overlap,\n",
    "        length_function=len,\n",
    "    )\n",
    "    chunks: List[str] = text_splitter.split_text(text)\n",
    "\n",
    "    # Replace special characters with spaces.\n",
    "    chunks = [text.replace(\"<br /><br />\", \" \") for text in chunks]\n",
    "\n",
    "    return chunks\n",
    "\n",
    "# Use recursive splitter to chunk text.\n",
    "def imdb_chunk_text(batch_size, df, chunk_size):\n",
    "\n",
    "    batch = df.head(batch_size).copy()\n",
    "    print(f\"chunk size: {chunk_size}\")\n",
    "    print(f\"original shape: {batch.shape}\")\n",
    "    \n",
    "    start_time = time.time()\n",
    "\n",
    "    # 1. Chunk the text review into chunk_size.\n",
    "    batch['chunk'] = batch['text'].apply(recursive_splitter_wrapper, chunk_size=chunk_size)\n",
    "    # Explode the 'chunk' column to create new rows for each chunk.\n",
    "    batch = batch.explode('chunk', ignore_index=True)\n",
    "    print(f\"new shape: {batch.shape}\")\n",
    "\n",
    "    # 2. Add embeddings as new column in df.\n",
    "    embeddings = torch.tensor(encoder.encode(batch['chunk']))\n",
    "    # Normalize the embeddings.\n",
    "    embeddings = np.array(embeddings / np.linalg.norm(embeddings))\n",
    "\n",
    "    # 3. Convert embeddings to list of `numpy.ndarray`, each containing `numpy.float32` numbers.\n",
    "    converted_values = list(map(np.float32, embeddings))\n",
    "    batch['vector'] = converted_values\n",
    "\n",
    "    end_time = time.time()\n",
    "    print(f\"Chunking + embedding time for {batch_size} docs: {end_time - start_time} sec\")\n",
    "    # Inspect the batch of data.\n",
    "    assert len(batch.chunk[0]) <= MAX_SEQ_LENGTH-1\n",
    "    assert len(batch.vector[0]) == EMBEDDING_DIM\n",
    "    print(f\"type embeddings: {type(batch.vector)} of {type(batch.vector[0])}\")\n",
    "    print(f\"of numbers: {type(batch.vector[0][0])}\")\n",
    "\n",
    "    return batch"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "843bb7e6",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "chunk size: 512\n",
      "original shape: (100, 8)\n",
      "new shape: (100, 9)\n",
      "Chunking + embedding time for 100 docs: 6.592205047607422 sec\n",
      "type embeddings: <class 'pandas.core.series.Series'> of <class 'numpy.ndarray'>\n",
      "of numbers: <class 'numpy.float32'>\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>movie_index</th>\n",
       "      <th>title</th>\n",
       "      <th>description</th>\n",
       "      <th>poster_url</th>\n",
       "      <th>labels</th>\n",
       "      <th>Genres</th>\n",
       "      <th>film_year</th>\n",
       "      <th>text</th>\n",
       "      <th>chunk</th>\n",
       "      <th>vector</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>tt6443346</td>\n",
       "      <td>Black Adam</td>\n",
       "      <td>Nearly 5,000 years after he was bestowed with ...</td>\n",
       "      <td>https://m.media-amazon.com/images/M/MV5BYzZkOG...</td>\n",
       "      <td>SuperHero</td>\n",
       "      <td>[Action, Adventure, Fantasy]</td>\n",
       "      <td>2022</td>\n",
       "      <td>Black Adam Nearly 5,000 years after he was bes...</td>\n",
       "      <td>Black Adam Nearly 5,000 years after he was bes...</td>\n",
       "      <td>[0.0040127905, 0.001754614, -0.0010465175, 0.0...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>tt10954600</td>\n",
       "      <td>Ant-Man and the Wasp: Quantumania</td>\n",
       "      <td>Scott Lang and Hope Van Dyne, along with Hank ...</td>\n",
       "      <td>https://m.media-amazon.com/images/M/MV5BNDgyNG...</td>\n",
       "      <td>SuperHero</td>\n",
       "      <td>[Action, Adventure, Comedy]</td>\n",
       "      <td>2023</td>\n",
       "      <td>Ant-Man and the Wasp: Quantumania Scott Lang a...</td>\n",
       "      <td>Ant-Man and the Wasp: Quantumania Scott Lang a...</td>\n",
       "      <td>[0.008013694, -0.001968402, 0.0013089634, 0.00...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "  movie_index                              title  \\\n",
       "0   tt6443346                         Black Adam   \n",
       "1  tt10954600  Ant-Man and the Wasp: Quantumania   \n",
       "\n",
       "                                         description  \\\n",
       "0  Nearly 5,000 years after he was bestowed with ...   \n",
       "1  Scott Lang and Hope Van Dyne, along with Hank ...   \n",
       "\n",
       "                                          poster_url     labels  \\\n",
       "0  https://m.media-amazon.com/images/M/MV5BYzZkOG...  SuperHero   \n",
       "1  https://m.media-amazon.com/images/M/MV5BNDgyNG...  SuperHero   \n",
       "\n",
       "                         Genres  film_year  \\\n",
       "0  [Action, Adventure, Fantasy]       2022   \n",
       "1   [Action, Adventure, Comedy]       2023   \n",
       "\n",
       "                                                text  \\\n",
       "0  Black Adam Nearly 5,000 years after he was bes...   \n",
       "1  Ant-Man and the Wasp: Quantumania Scott Lang a...   \n",
       "\n",
       "                                               chunk  \\\n",
       "0  Black Adam Nearly 5,000 years after he was bes...   \n",
       "1  Ant-Man and the Wasp: Quantumania Scott Lang a...   \n",
       "\n",
       "                                              vector  \n",
       "0  [0.0040127905, 0.001754614, -0.0010465175, 0.0...  \n",
       "1  [0.008013694, -0.001968402, 0.0013089634, 0.00...  "
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "## Chunk and Embed Text Data\n",
    "\n",
    "# Use the embedding model parameters.\n",
    "# chunk_size = MAX_SEQ_LENGTH - HF_EOS_TOKEN_LENGTH\n",
    "chunk_size = 512\n",
    "chunk_overlap = np.round(chunk_size * 0.10, 0)\n",
    "\n",
    "# Chunk a batch of data from pandas DataFrame and inspect it.\n",
    "BATCH_SIZE = 100\n",
    "batch = imdb_chunk_text(BATCH_SIZE, df, chunk_size)\n",
    "display(batch.head(2))\n",
    "\n",
    "# Drop the original text column, keep the new 'chunk' column.\n",
    "batch.drop(columns=['text'], axis=1, inplace=True)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "d9bd8153",
   "metadata": {},
   "source": [
    "## Insert data into Milvus\n",
    "\n",
    "For each original text chunk, we'll write the quadruplet (`vector, text, source, h1, h2`) into the database.\n",
    "\n",
    "<div>\n",
    "<img src=\"../../images/db_insert.png\" width=\"80%\"/>\n",
    "</div>\n",
    "\n",
    "**The Milvus Client wrapper can only handle loading data from a list of dictionaries.**\n",
    "\n",
    "Otherwise, in general, Milvus supports loading data from:\n",
    "- pandas dataframes \n",
    "- list of dictionaries\n",
    "\n",
    "Below, we use the embedding model provided by HuggingFace, download its checkpoint, and run it locally as the encoder.  "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Start inserting entities\n",
      "Milvus Client insert time for 100 vectors: 0.36271023750305176 seconds\n"
     ]
    }
   ],
   "source": [
    "# STEP 5. INSERT CHUNKS AND EMBEDDINGS IN ZILLIZ.\n",
    "\n",
    "# Convert the DataFrame to a list of dictionaries\n",
    "chunk_list = batch.to_dict(orient='records')\n",
    "\n",
    "# Insert data into the Milvus collection.\n",
    "print(\"Start inserting entities\")\n",
    "start_time = time.time()\n",
    "insert_result = mc.insert(\n",
    "    COLLECTION_NAME,\n",
    "    data=chunk_list,\n",
    "    progress_bar=True)\n",
    "end_time = time.time()\n",
    "print(f\"Milvus Client insert time for {batch.shape[0]} vectors: {end_time - start_time} seconds\")\n",
    "\n",
    "# Milvus Client does an automatic flush, save data to S3.\n",
    "\n",
    "# CSV: Milvus Client insert time for 456 vectors: 8.96166205406189 seconds\n",
    "# JSON: Milvus Client insert time for 100 vectors: 0.3610990047454834 seconds"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "66ae14dc",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'movie_index': 'tt6443346',\n",
       " 'title': 'Black Adam',\n",
       " 'description': 'Nearly 5,000 years after he was bestowed with the almighty powers of the Egyptian gods - and imprisoned just as quickly - Black Adam is freed from his earthly tomb, ready to unleash his unique form of justice on the modern world.',\n",
       " 'poster_url': 'https://m.media-amazon.com/images/M/MV5BYzZkOGUwMzMtMTgyNS00YjFlLTg5NzYtZTE3Y2E5YTA5NWIyXkEyXkFqcGdeQXVyMjkwOTAyMDU@._V1_QL75_UX190_CR0,0,190,281_.jpg',\n",
       " 'labels': 'SuperHero',\n",
       " 'Genres': ['Action', 'Adventure', 'Fantasy'],\n",
       " 'film_year': 2022,\n",
       " 'chunk': 'Black Adam Nearly 5,000 years after he was bestowed with the almighty powers of the Egyptian gods - and imprisoned just as quickly - Black Adam is freed from his earthly tomb, ready to unleash his unique form of justice on the modern world.',\n",
       " 'vector': array([ 0.00401279,  0.00175461, -0.00104652, ..., -0.00563261,\n",
       "        -0.00292824,  0.0038763 ], dtype=float32)}"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Print a single row of the data that was inserted.\n",
    "chunk_list = batch.to_dict(orient='records')\n",
    "chunk_list[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "8d7b4766",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'aliases': [],\n",
      " 'auto_id': True,\n",
      " 'collection_id': 448076879578126663,\n",
      " 'collection_name': 'imdb_metadata',\n",
      " 'consistency_level': 3,\n",
      " 'description': '',\n",
      " 'enable_dynamic_field': True,\n",
      " 'fields': [{'auto_id': True,\n",
      "             'description': '',\n",
      "             'field_id': 100,\n",
      "             'is_primary': True,\n",
      "             'name': 'id',\n",
      "             'params': {},\n",
      "             'type': <DataType.INT64: 5>},\n",
      "            {'description': '',\n",
      "             'field_id': 101,\n",
      "             'name': 'vector',\n",
      "             'params': {'dim': 1024},\n",
      "             'type': <DataType.FLOAT_VECTOR: 101>}],\n",
      " 'num_partitions': 1,\n",
      " 'num_shards': 1,\n",
      " 'properties': {}}\n",
      "timing: 0.037996768951416016 seconds\n",
      "\n",
      "[{'count(*)': 100}]\n",
      "timing: 0.12284588813781738 seconds\n"
     ]
    }
   ],
   "source": [
    "# Example PyMilvus utility API calls.\n",
    "\n",
    "# Counting rows via num_entities (which flushes first) is not supported\n",
    "# by Milvus Client, so that call stays disabled:\n",
    "# print(f\"Count rows: {mc.num_entities(COLLECTION_NAME)}\")\n",
    "\n",
    "# Show the collection's schema and settings, timing the call.\n",
    "start_time = time.time()\n",
    "collection_info = mc.describe_collection(COLLECTION_NAME)\n",
    "pprint.pprint(collection_info)\n",
    "end_time = time.time()\n",
    "print(f\"timing: {end_time - start_time} seconds\")\n",
    "print()\n",
    "\n",
    "# Count rows with a count(*) query, which does not incur a .flush() call.\n",
    "start_time = time.time()\n",
    "res = mc.query(\n",
    "    collection_name=COLLECTION_NAME,\n",
    "    filter=\"\",\n",
    "    output_fields=[\"count(*)\"],\n",
    ")\n",
    "pprint.pprint(res)\n",
    "end_time = time.time()\n",
    "print(f\"timing: {end_time - start_time} seconds\")\n",
    "\n",
    "# Optional: view the stored rows themselves, also without a .flush() call.\n",
    "# OUTPUT_FIELDS = ['movie_index', 'title', 'description',\n",
    "#   'poster_url', 'labels', 'Genres', 'film_year', 'chunk']\n",
    "# res = mc.query( collection_name=COLLECTION_NAME, \n",
    "#                filter=\"id >= 0\", \n",
    "#                output_fields = OUTPUT_FIELDS, )\n",
    "# pprint.pprint(res)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "c022c38a",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['movie_index',\n",
       " 'title',\n",
       " 'description',\n",
       " 'poster_url',\n",
       " 'labels',\n",
       " 'Genres',\n",
       " 'film_year',\n",
       " 'chunk']"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Metadata fields you can filter on, taken from the dataframe's column names.\n",
    "OUTPUT_FIELDS = df.columns.tolist()\n",
    "# The inserted rows use 'chunk' where the dataframe has 'text', so swap that name.\n",
    "text_position = OUTPUT_FIELDS.index('text')\n",
    "OUTPUT_FIELDS[text_position] = 'chunk'\n",
    "OUTPUT_FIELDS"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "129bc5bb",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['Action',\n",
       " 'Adventure',\n",
       " 'Crime',\n",
       " 'Horror',\n",
       " 'Sci-Fi',\n",
       " 'Thriller',\n",
       " 'Drama',\n",
       " 'Mystery',\n",
       " 'Comedy',\n",
       " 'Animation',\n",
       " 'Fantasy']"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# List the distinct genres, sorted so the output is identical on every run\n",
    "# (a plain set of strings has no guaranteed iteration order).\n",
    "GENRES = sorted({genre for genres in df['Genres'] for genre in genres})\n",
    "GENRES"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "id": "e84931bc",
   "metadata": {},
   "outputs": [],
   "source": [
    "if DATA_TYPE == \"CSV\":\n",
    "    # The rating histogram is only drawn when the CSV dataset is selected.\n",
    "    import matplotlib.pyplot as plt\n",
    "    fig, ax = plt.subplots(figsize=(4, 2))\n",
    "    df['RatingValue'].hist(ax=ax);\n",
    "    # Scale of 1-10.  Popular movies are rated >= 7."
   ]
  },
  {
   "cell_type": "markdown",
   "id": "02c589ff",
   "metadata": {},
   "source": [
    "## Ask a question about your data\n",
    "\n",
    "So far in this demo notebook: \n",
    "1. Your custom data has been mapped into a vector embedding space\n",
    "2. Those vector embeddings have been saved into a vector database\n",
    "\n",
    "Next, you can ask a question about your custom data!\n",
    "\n",
    "💡 In LLM vocabulary:\n",
    "> **Query** is the generic term for user questions.  \n",
    "A query is a list of multiple individual questions, up to maybe 1000 different questions!\n",
    "\n",
    "> **Question** usually refers to a single user question.  \n",
    "In our example below, the user question is \"Dystopia science fiction with a robot.\"\n",
    "\n",
    "> **Semantic Search** = very fast search of the entire knowledge base to find the `TOP_K` documentation chunks with the closest embeddings to the user's query.\n",
    "\n",
    "💡 The same model should always be used for consistency for all the embeddings data and the query."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "id": "5e7f41f4",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "query length: 45\n"
     ]
    }
   ],
   "source": [
    "# Define a sample question about your data.\n",
    "\n",
    "# These 2 questions are for the CSV dataset.\n",
    "QUESTION1 = \"I'm a medical doctor of Lou Gehrig's disease.\"\n",
    "QUESTION2 = \"Bollywood\"\n",
    "\n",
    "# This question for the JSON dataset.\n",
    "QUESTION3 = \"Dystopia science fiction with a robot.\"\n",
    "\n",
    "# Inspect the length (in characters) of QUESTION1.\n",
    "QUERY_LENGTH = len(QUESTION1)\n",
    "print(f\"query length: {QUERY_LENGTH}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "# SELECT A PARTICULAR QUESTION TO ASK.\n",
    "\n",
    "SAMPLE_QUESTION = QUESTION3"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "9ea29411",
   "metadata": {},
   "source": [
    "## Execute a vector search\n",
    "\n",
    "Search Milvus using [PyMilvus API](https://milvus.io/docs/search.md).\n",
    "\n",
    "💡 By their nature, vector searches are \"semantic\" searches.  For example, if you were to search for \"leaky faucet\": \n",
    "> **Traditional Key-word Search** - either or both words \"leaky\", \"faucet\" would have to match some text in order to return a web page or link text to the document.\n",
    "\n",
    "> **Semantic search** - results containing words \"drippy\" \"taps\" would be returned as well because these words mean the same thing even though they are different words."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "id": "9673ce4a",
   "metadata": {},
   "outputs": [],
   "source": [
    "def mc_run_search(question, filter_expression, top_k=2):\n",
    "    \"\"\"Run a metadata-filtered semantic search for a single question.\n",
    "\n",
    "    Args:\n",
    "        question: The user's question as a string. It is embedded with the\n",
    "            same `encoder` that produced the stored vectors.\n",
    "        filter_expression: Milvus boolean expression over metadata fields,\n",
    "            e.g. 'film_year >= 2019'.\n",
    "        top_k: Maximum number of results to retrieve. Defaults to 2, the\n",
    "            value that was previously hard-coded.\n",
    "\n",
    "    Returns:\n",
    "        The tuple (formatted_results, context, context_metadata) produced by\n",
    "        `_utils.client_assemble_retrieved_context`.\n",
    "    \"\"\"\n",
    "    # Embed the question using the same encoder.\n",
    "    query_embeddings = _utils.embed_query(encoder, [question])\n",
    "\n",
    "    # HNSW search-time setting: re-use the index build parameter as the\n",
    "    # number of candidate nearest neighbors.\n",
    "    # NOTE(review): some pymilvus versions expect index settings nested under\n",
    "    # a \"params\" key -- confirm that a top-level \"ef\" is honoured.\n",
    "    search_params = {\"ef\": INDEX_PARAMS['efConstruction']}\n",
    "\n",
    "    # Run semantic vector search using your query and the vector database.\n",
    "    results = mc.search(\n",
    "        COLLECTION_NAME,\n",
    "        data=query_embeddings,\n",
    "        search_params=search_params,\n",
    "        output_fields=OUTPUT_FIELDS,\n",
    "        # Milvus can utilize metadata in boolean expressions to filter search.\n",
    "        filter=filter_expression,\n",
    "        limit=top_k,\n",
    "        consistency_level=\"Eventually\"\n",
    "    )\n",
    "\n",
    "    # Assemble retrieved context and context metadata.\n",
    "    # Every output field except the chunk text itself is treated as metadata.\n",
    "    metadata_fields = [f for f in OUTPUT_FIELDS if f != 'chunk']\n",
    "    formatted_results, context, context_metadata = _utils.client_assemble_retrieved_context(\n",
    "        results, metadata_fields=metadata_fields, num_shot_answers=top_k)\n",
    "\n",
    "    return formatted_results, context, context_metadata"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "id": "e25ccac6",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "filter: json_contains(Genres, \"Sci-Fi\") and film_year < 2019\n",
      "Milvus Client search time for 100 vectors: 0.3123598098754883 seconds\n",
      "type: <class 'list'>, count: 2\n"
     ]
    }
   ],
   "source": [
    "# Run the search.\n",
    "\n",
    "# Example metadata filter for the CSV dataset:\n",
    "# expression = \"RatingValue >= 7\"\n",
    "\n",
    "# Example metadata filters for the JSON dataset:\n",
    "# expression = 'film_year >= 2019'\n",
    "expression = 'json_contains(Genres, \"Sci-Fi\") and film_year < 2019'\n",
    "print(f\"filter: {expression}\")\n",
    "\n",
    "# Time the whole call: embedding the question plus the filtered search.\n",
    "start_time = time.time()\n",
    "search_output = mc_run_search(SAMPLE_QUESTION, expression)\n",
    "elapsed_time = time.time() - start_time\n",
    "formatted_results, context, context_metadata = search_output\n",
    "num_vectors = len(chunk_list)\n",
    "print(f\"Milvus Client search time for {num_vectors} vectors: {elapsed_time} seconds\")\n",
    "\n",
    "# Inspect search result.\n",
    "result_type, result_count = type(formatted_results), len(formatted_results)\n",
    "print(f\"type: {result_type}, count: {result_count}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "id": "bb53d3cd",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Retrieved result #1\n",
      "distance = 0.5802159309387207\n",
      "movie_index: tt0434409\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<img src=\"https://m.media-amazon.com/images/M/MV5BOTI5ODc3NzExNV5BMl5BanBnXkFtZTcwNzYxNzQzMw@@._V1_QL75_UX190_CR0,0,190,281_.jpg\" width=\"150\" height=\"200\"/>"
      ],
      "text/plain": [
       "<IPython.core.display.Image object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "('Chunk text: V for Vendetta In a future British dystopian society, a shadowy '\n",
      " 'freedom fighter, known only by the alias of \"V\", plots to overthrow the '\n",
      " 'tyrannical government - with the help of a young woman.')\n",
      "movie_index: tt0434409\n",
      "title: V for Vendetta\n",
      "description: In a future British dystopian society, a shadowy freedom fighter, known only by the alias of \"V\", plots to overthrow the tyrannical government - with the help of a young woman.\n",
      "poster_url: https://m.media-amazon.com/images/M/MV5BOTI5ODc3NzExNV5BMl5BanBnXkFtZTcwNzYxNzQzMw@@._V1_QL75_UX190_CR0,0,190,281_.jpg\n",
      "labels: SuperHero\n",
      "Genres: ['Action', 'Drama', 'Sci-Fi']\n",
      "film_year: 2005\n",
      "\n",
      "Retrieved result #2\n",
      "distance = 0.5398463606834412\n",
      "movie_index: tt0489099\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<img src=\"https://m.media-amazon.com/images/M/MV5BMjEwOTkyOTI3M15BMl5BanBnXkFtZTcwNTQxMjU1MQ@@._V1_QL75_UX190_CR0,0,190,281_.jpg\" width=\"150\" height=\"200\"/>"
      ],
      "text/plain": [
       "<IPython.core.display.Image object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "('Chunk text: Jumper A teenager with teleportation abilities suddenly finds '\n",
      " 'himself in the middle of an ancient war between those like him and their '\n",
      " 'sworn annihilators.')\n",
      "movie_index: tt0489099\n",
      "title: Jumper\n",
      "description: A teenager with teleportation abilities suddenly finds himself in the middle of an ancient war between those like him and their sworn annihilators.\n",
      "poster_url: https://m.media-amazon.com/images/M/MV5BMjEwOTkyOTI3M15BMl5BanBnXkFtZTcwNTQxMjU1MQ@@._V1_QL75_UX190_CR0,0,190,281_.jpg\n",
      "labels: SuperHero\n",
      "Genres: ['Action', 'Adventure', 'Sci-Fi']\n",
      "film_year: 2008\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# Show each recommended movie: poster image, chunk text, and metadata.\n",
    "from IPython.display import Image\n",
    "from IPython.display import display\n",
    "\n",
    "# The poster URL lives under a different metadata key in each dataset.\n",
    "# An unrecognized DATA_TYPE raises KeyError here instead of failing later.\n",
    "poster_key = {\"CSV\": 'PosterLink', \"JSON\": 'poster_url'}[DATA_TYPE]\n",
    "\n",
    "# movie_index is printed in the result header and the poster is rendered as\n",
    "# an image, so both are left out of the metadata listing.\n",
    "skipped_keys = {poster_key, 'movie_index'}\n",
    "\n",
    "# Loop through recommended movies, display poster, print metadata.\n",
    "seen_movies = set()\n",
    "for i, chunk_text in enumerate(context):\n",
    "    print(f\"Retrieved result #{i+1}\")\n",
    "    print(f\"distance = {formatted_results[i][0]}\")\n",
    "    metadata = context_metadata[i]\n",
    "    movie_index = metadata['movie_index']\n",
    "    print(f\"movie_index: {movie_index}\")\n",
    "\n",
    "    # Don't display the same movie_index twice.\n",
    "    if movie_index in seen_movies:\n",
    "        continue\n",
    "    seen_movies.add(movie_index)\n",
    "\n",
    "    # Display the poster link as a rendered image.\n",
    "    display(Image(url=metadata[poster_key], width=150, height=200))\n",
    "\n",
    "    # Print the rest of the movie info.\n",
    "    pprint.pprint(f\"Chunk text: {chunk_text}\")\n",
    "    # Print metadata except the movie_index and poster link. The previous\n",
    "    # check joined two != tests with `or`, which is always True, so nothing\n",
    "    # was actually excluded.\n",
    "    for key, value in metadata.items():\n",
    "        if key not in skipped_keys:\n",
    "            print(f\"{key}: {value}\")\n",
    "    print()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "id": "6294947f",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Drop collection\n",
    "utility.drop_collection(COLLECTION_NAME)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "id": "c777937e",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Author: Christy Bergman\n",
      "\n",
      "Python implementation: CPython\n",
      "Python version       : 3.11.8\n",
      "IPython version      : 8.22.2\n",
      "\n",
      "torch                : 2.2.1\n",
      "transformers         : 4.39.1\n",
      "sentence_transformers: 2.6.0\n",
      "pymilvus             : 2.4.0\n",
      "langchain            : 0.1.13\n",
      "\n",
      "conda environment: py311\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# Props to Sebastian Raschka for this handy watermark.\n",
    "# !pip install watermark\n",
    "\n",
    "%load_ext watermark\n",
    "%watermark -a 'Christy Bergman' -v -p torch,transformers,sentence_transformers,pymilvus,langchain --conda"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
